In [51]:
from collections import Counter
import os
import re
import sys
import time
from cltk.corpus.utils.formatter import assemble_phi5_works_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tag.pos import POSTag
from nltk.tokenize.punkt import PunktLanguageVars
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
In [2]:
def works_texts_list(rm_punctuation, rm_periods):
    """Build a list of plaintext PHI5 works, with curly-brace editorial notes removed."""
    fps = assemble_phi5_works_filepaths()
    curly_comp = re.compile(r'{.+?}')
    _list = []
    for fp in fps:
        with open(fp) as fo:
            fr = fo.read()
        text = phi5_plaintext_cleanup(fr, rm_punctuation, rm_periods)
        text = curly_comp.sub('', text)
        _list.append(text)
    return _list
In [6]:
t0 = time.time()
text_list = works_texts_list(rm_punctuation=True, rm_periods=True)
print('Total texts', len(text_list))
print('Time to build list of texts: {}'.format(time.time() - t0))
In [5]:
# bag of words/word count
def bow_csv():
    t0 = time.time()
    vectorizer = CountVectorizer(min_df=1)
    term_document_matrix = vectorizer.fit_transform(text_list)
    # get_feature_names() is only available after fitting
    column_names = ['wc_' + w for w in vectorizer.get_feature_names()]
    dataframe_bow = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF BOW shape', dataframe_bow.shape)
    fp = os.path.expanduser('~/cltk_data/user_data/bow_latin.csv')
    dataframe_bow.to_csv(fp)
    print('Time to create BOW vectorizer and write csv: {}'.format(time.time() - t0))
In [6]:
#bow_csv()
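A quick sanity check (a sketch, not part of the pipeline): on a toy corpus, CountVectorizer builds one column per vocabulary term and one row of raw counts per document. The toy strings below are illustrative, not from PHI5.
In [ ]:
# Sketch: CountVectorizer on a toy corpus (illustrative strings only).
toy_docs = ['arma virumque cano', 'arma arma']
toy_vec = CountVectorizer(min_df=1)
toy_matrix = toy_vec.fit_transform(toy_docs)
print(toy_vec.get_feature_names())  # ['arma', 'cano', 'virumque']
print(toy_matrix.toarray())         # [[1 1 1], [2 0 0]]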
In [7]:
# tf-idf
def tfidf_csv():
    t0 = time.time()
    vectorizer = TfidfVectorizer(min_df=1)
    term_document_matrix = vectorizer.fit_transform(text_list)
    # get_feature_names() is only available after fitting
    column_names = ['tfidf_' + w for w in vectorizer.get_feature_names()]
    dataframe_tfidf = pd.DataFrame(term_document_matrix.toarray(), columns=column_names)
    print('DF tf-idf shape', dataframe_tfidf.shape)
    fp = os.path.expanduser('~/cltk_data/user_data/tfidf_latin.csv')
    dataframe_tfidf.to_csv(fp)
    print('Time to create tf-idf vectorizer and write csv: {}'.format(time.time() - t0))
In [1]:
#tfidf_csv()
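For reference, TfidfVectorizer uses the same term-document counts but weights each count by a smoothed inverse document frequency (ln((1 + n_docs) / (1 + df)) + 1 with sklearn's defaults) and then L2-normalizes each row, so terms that appear in every work contribute less. A minimal sketch with illustrative strings:
In [ ]:
# Sketch: tf-idf downweights terms common to all docs ('arma' and 'cano'
# here) relative to rarer ones ('virumque').
# sklearn defaults: smooth_idf=True, norm='l2'.
toy_docs = ['arma virumque cano', 'arma cano']
toy_tfidf = TfidfVectorizer(min_df=1).fit_transform(toy_docs)
print(toy_tfidf.toarray())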
In [5]:
# char count
# word count
# sentence count
# word length counts
In [12]:
def char_len():
    """Count chars in each input string (doc)."""
    t0 = time.time()
    char_lens = {}
    for i, doc in enumerate(text_list):
        char_lens[i] = pd.Series(len(doc), index=['char_len'])
    df_char_len = pd.DataFrame(char_lens).transpose()
    fp = os.path.expanduser('~/cltk_data/user_data/char_len_latin.csv')
    df_char_len.to_csv(fp)
    print('Time to create doc char len counts: {}'.format(time.time() - t0))
char_len()
In [16]:
def word_count():
    """Count words in each input string (doc)."""
    t0 = time.time()
    p = PunktLanguageVars()
    word_counts = {}
    for i, doc in enumerate(text_list):
        wc_doc = len(p.word_tokenize(doc))
        word_counts[i] = pd.Series(wc_doc, index=['word_count'])
    df_word_count = pd.DataFrame(word_counts).transpose()
    # distinct filepath from word_len_counts() below, which writes
    # word_count_lens_latin.csv
    fp = os.path.expanduser('~/cltk_data/user_data/word_count_latin.csv')
    df_word_count.to_csv(fp)
    print('Time to create doc word count: {}'.format(time.time() - t0))
word_count()
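A note on tokenization (a quick check, not part of the pipeline): PunktLanguageVars splits punctuation such as commas into separate tokens, which is one reason the cleaned text_list (rm_punctuation=True) is used for the raw word counts above.
In [ ]:
# Quick check: punctuation comes back as separate tokens.
p = PunktLanguageVars()
print(p.word_tokenize('arma virumque cano, Troiae qui primus ab oris'))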
In [34]:
text_list_no_cleanup = works_texts_list(rm_punctuation=False, rm_periods=False)
In [14]:
text_list_no_cleanup[1][:500]
Out[14]:
In [5]:
# see how sent tokenizer works
s = ' ex scriptis eorum qui ueri arbitrantur . . . neque ipsi eos alii modi esse atque Amilcar dixit, ostendere possunt aliter. antequam Barcha perierat, alii rei causa in Africam missus . . . . . . tantum bellum suscitare conari aduersarios contra bellosum genus. qui cum is ita foedus icistis . . . . . . cum iure sine periculo bellum geri poteratur. qui intellegunt quae fiant, dissentiuntur. Legati quo missi sunt ueniunt, dedicant mandata. Saguntinorum Sempronius Lilybaeo celocem in Africam mittit u'
tokenizer = TokenizeSentence('latin')
sent_tokens = tokenizer.tokenize_sentences(s)
sent_tokens = [s for s in sent_tokens if len(s) > 1] # rm '.' sents
sent_tokens
Out[5]:
In [9]:
def sentence_count():
    """Count sentences in each input string (doc)."""
    t0 = time.time()
    tokenizer = TokenizeSentence('latin')
    sentence_counts = {}
    for i, doc in enumerate(text_list_no_cleanup):
        sent_tokens = tokenizer.tokenize_sentences(doc)
        sent_tokens = [s for s in sent_tokens if len(s) > 1]  # rm '.' sents
        sentence_counts[i] = pd.Series(len(sent_tokens), index=['sentence_count'])
    df_sentence_count = pd.DataFrame(sentence_counts).transpose()
    fp = os.path.expanduser('~/cltk_data/user_data/sentence_count_lens_latin.csv')
    df_sentence_count.to_csv(fp)
    print('Time to create doc sentence count: {}'.format(time.time() - t0))
sentence_count()
In [43]:
def word_len_counts():
    """Count word lengths in each input string (doc)."""
    t0 = time.time()
    p = PunktLanguageVars()
    word_counts = {}
    for i, doc in enumerate(text_list_no_cleanup):
        word_tokens = p.word_tokenize(doc)
        list_of_counts = ['word_len_' + str(len(w)) for w in word_tokens]
        counter_word_counts = Counter(list_of_counts)
        word_counts[i] = pd.Series(counter_word_counts, index=counter_word_counts.keys())
    df_word_count = pd.DataFrame(word_counts).transpose()
    fp = os.path.expanduser('~/cltk_data/user_data/word_count_lens_latin.csv')
    df_word_count.to_csv(fp)
    print('Time to create doc word length counts: {}'.format(time.time() - t0))
word_len_counts()
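One caveat on the word-length matrix: a document that never contains a word of some length gets NaN rather than 0 in that column, since each doc's Series only carries the keys its Counter saw. A sketch of filling those in when reading the csv back, if zero counts are preferable downstream:
In [ ]:
# Sketch: read the word-length counts back and replace NaN with 0.
fp = os.path.expanduser('~/cltk_data/user_data/word_count_lens_latin.csv')
df_lens = pd.read_csv(fp, index_col=0).fillna(0)
df_lens.head()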
In [50]:
def sentence_word_count():
    """Count words per sentence in each input string (doc)."""
    t0 = time.time()
    tokenizer_sent = TokenizeSentence('latin')
    p = PunktLanguageVars()
    word_counts = {}
    for i, doc in enumerate(text_list_no_cleanup):
        list_words_per_sentence = []
        sent_tokens = tokenizer_sent.tokenize_sentences(doc)
        sent_tokens = [s for s in sent_tokens if len(s) > 1]  # rm '.' sents
        for sent in sent_tokens:
            word_tokens = p.word_tokenize(sent)
            words_in_sent = len(word_tokens)
            list_words_per_sentence.append(words_in_sent)
        list_of_counts = ['words_in_sent_' + str(count) for count in list_words_per_sentence]
        counter_word_counts_per_sents = Counter(list_of_counts)
        word_counts[i] = pd.Series(counter_word_counts_per_sents,
                                   index=counter_word_counts_per_sents.keys())
    df_word_count_per_sent = pd.DataFrame(word_counts).transpose()
    fp = os.path.expanduser('~/cltk_data/user_data/words_per_sent_latin.csv')
    df_word_count_per_sent.to_csv(fp)
    print('Time to create count of words per sentence: {}'.format(time.time() - t0))
sentence_word_count()
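The same counts also yield an average sentence length per document. A sketch, assuming the csv written just above:
In [ ]:
# Sketch: mean words per sentence, recovered from the words_in_sent_* counts.
fp = os.path.expanduser('~/cltk_data/user_data/words_per_sent_latin.csv')
df_wps = pd.read_csv(fp, index_col=0).fillna(0)
sent_lens = [int(c.replace('words_in_sent_', '')) for c in df_wps.columns]
mean_words = (df_wps * sent_lens).sum(axis=1) / df_wps.sum(axis=1)
mean_words.head()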
In [ ]:
def pos_counts(index_start=0, index_break=99):
    """Count parts of speech in each input string (doc), in chunks of docs
    so long runs can be resumed."""
    t0 = time.time()
    tokenizer_sent = TokenizeSentence('latin')
    pos_counts = {}
    tagger = POSTag('latin')
    # slice from index_start so resumed runs don't re-tag earlier docs
    for i, doc in enumerate(text_list_no_cleanup[index_start:], start=index_start):
        print('Processing doc #{}'.format(i))
        pos_tags_list = []
        sent_tokens = tokenizer_sent.tokenize_sentences(doc)
        sent_tokens = [s for s in sent_tokens if len(s) > 1]  # rm '.' sents
        for sent in sent_tokens:
            pos_tags = tagger.tag_tnt(sent.lower())
            pos_tags = [t[1] for t in pos_tags]
            pos_tags_list += pos_tags
        pos_counts_counter = Counter(pos_tags_list)
        pos_counts[i] = pd.Series(pos_counts_counter, index=pos_counts_counter.keys())
        if i == index_break:
            print('breaking …')
            break
    df_pos_counts = pd.DataFrame(pos_counts).transpose()
    fp = os.path.expanduser('~/cltk_data/user_data/pos_counts_latin_{}.csv'.format(index_start))
    df_pos_counts.to_csv(fp)
    print('Time to create POS counts: {}'.format(time.time() - t0))
pos_counts(index_start=0, index_break=99)
#pos_counts(index_start=100, index_break=199)
#pos_counts(index_start=200, index_break=299)
#pos_counts(index_start=300, index_break=399)
#pos_counts(index_start=400, index_break=499)
#pos_counts(index_start=500, index_break=599)
#pos_counts(index_start=600, index_break=699)
#pos_counts(index_start=700, index_break=799)
#pos_counts(index_start=800, index_break=899)
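A possible last step (a sketch, assuming the filenames written above): join the per-document feature csvs into a single matrix on the shared document index. The pos_counts_latin_*.csv chunks would need concatenating first before joining them in.
In [ ]:
# Sketch: assemble one feature matrix from the csvs written above.
feature_files = ['char_len_latin.csv', 'word_count_latin.csv',
                 'sentence_count_lens_latin.csv', 'word_count_lens_latin.csv',
                 'words_per_sent_latin.csv']
dfs = [pd.read_csv(os.path.expanduser('~/cltk_data/user_data/' + f), index_col=0)
       for f in feature_files]
df_features = pd.concat(dfs, axis=1).fillna(0)
print('Feature matrix shape', df_features.shape)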
In [ ]: